#135 #141 2 deduplication fixes for the website agent

Fix 1) #135 update events' expiry date on deduplication

Previously events would age out even when being de-duplicated
frequently, resulting in spurious identical events making their way to
the next agent in the pipeline. This isn't the most efficient code (same
as previously), but not worth optimizing further until we've decided
what to do with a more generic deduplication module

Fix 2) #135 set deduplication "look back" to be configurable (and
default to a large number)

I think this is preferable default behavior? Potential performance
issues should be addressed via an efficient deduplication library using
indexed DB fields

Alex Piggott 10 anos atrás
pai
commit
7b38df61ed
2 arquivos alterados com 56 adições e 10 exclusões
  1. 25 7
      app/models/agents/website_agent.rb
  2. 31 3
      spec/models/agents/website_agent_spec.rb

+ 25 - 7
app/models/agents/website_agent.rb

@@ -34,7 +34,9 @@ module Agents
34 34
 
35 35
       Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.
36 36
 
37
-      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
37
+      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent (only used to set the "working" status).
38
+
39
+      Set `uniqueness_look_back` (defaults to 10000) to limit the number of events checked for uniqueness (typically for performance).
38 40
     MD
39 41
 
40 42
     event_description do
@@ -43,7 +45,7 @@ module Agents
43 45
 
44 46
     default_schedule "every_12h"
45 47
 
46
-    UNIQUENESS_LOOK_BACK = 30
48
+    UNIQUENESS_LOOK_BACK = 10000
47 49
 
48 50
     def working?
49 51
       event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
@@ -82,10 +84,11 @@ module Agents
82 84
       end
83 85
       request.on_success do |response|
84 86
         doc = parse(response.body)
87
+        old_events = previous_payloads
85 88
 
86 89
         if extract_full_json?
87 90
           result = doc
88
-          if store_payload? result
91
+          if store_payload? old_events, result
89 92
             log "Storing new result for '#{name}': #{result.inspect}"
90 93
             create_event :payload => result
91 94
           end
@@ -125,7 +128,7 @@ module Agents
125 128
               end
126 129
             end
127 130
 
128
-            if store_payload? result
131
+            if store_payload? old_events, result
129 132
               log "Storing new parsed result for '#{name}': #{result.inspect}"
130 133
               create_event :payload => result
131 134
             end
@@ -138,12 +141,27 @@ module Agents
138 141
 
139 142
     private
140 143
 
141
-    def store_payload? result
142
-      !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json))
144
+    def store_payload?(old_events, result)
145
+      if !options['mode']
146
+        return true
147
+      elsif options['mode'].to_s == "all"
148
+        return true
149
+      elsif options['mode'].to_s == "on_change"
150
+        old_events.each do |old_event|
151
+          if old_event.payload.to_json == result.to_json
152
+            old_event.expires_at = new_event_expiration_date
153
+            old_event.save
154
+            return false
155
+         end
156
+        end
157
+        return true
158
+      end
159
+      raise "Illegal options[mode]: " + options['mode'].to_s
143 160
     end
144 161
 
145 162
     def previous_payloads
146
-      events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
163
+      look_back = options['uniqueness_look_back'] ? options['uniqueness_look_back'].to_i : UNIQUENESS_LOOK_BACK
164
+      events.order("id desc").limit(look_back) if options['mode'].to_s == "on_change"
147 165
     end
148 166
 
149 167
     def extract_full_json?

+ 31 - 3
spec/models/agents/website_agent_spec.rb

@@ -15,15 +15,19 @@ describe Agents::WebsiteAgent do
15 15
           'title' => {'css' => "#comic img", 'attr' => "title"}
16 16
         }
17 17
       }
18
-      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site)
18
+      @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2)
19 19
       @checker.user = users(:bob)
20 20
       @checker.save!
21 21
     end
22 22
 
23 23
     describe "#check" do
24
-      it "should check for changes" do
24
+      it "should check for changes (and update Event.expires_at)" do
25 25
         lambda { @checker.check }.should change { Event.count }.by(1)
26
+        event = Event.last
27
+        sleep 2
26 28
         lambda { @checker.check }.should_not change { Event.count }
29
+        update_event = Event.last
30
+        update_event.expires_at.should_not == event.expires_at
27 31
       end
28 32
 
29 33
       it "should always save events when in :all mode" do
@@ -35,6 +39,30 @@ describe Agents::WebsiteAgent do
35 39
         }.should change { Event.count }.by(2)
36 40
       end
37 41
 
42
+      it "should take uniqueness_look_back into account during deduplication" do
43
+        @site['mode'] = 'all'
44
+        @checker.options = @site
45
+        @checker.check
46
+        @checker.check
47
+        event = Event.last
48
+        event.payload = "{}"
49
+        event.save
50
+
51
+        lambda {
52
+          @site['mode'] = 'on_change'
53
+          @site['uniqueness_look_back'] = 2
54
+          @checker.options = @site
55
+          @checker.check
56
+        }.should_not change { Event.count }
57
+
58
+        lambda {
59
+          @site['mode'] = 'on_change'
60
+          @site['uniqueness_look_back'] = 1
61
+          @checker.options = @site
62
+          @checker.check
63
+        }.should change { Event.count }.by(1)
64
+      end
65
+
38 66
       it "should log an error if the number of results for a set of extraction patterns differs" do
39 67
         @site['extract']['url']['css'] = "div"
40 68
         @checker.options = @site
@@ -217,4 +245,4 @@ describe Agents::WebsiteAgent do
217 245
     end
218 246
   end
219 247
 
220
-end
248
+end